Biostat 212a Homework 4

Due Mar. 4, 2025 @ 11:59PM

Author

Jiaye Tian UID: 306541095

Published

March 4, 2025

1 ISL Exercise 8.4.3 (10pts)
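For a two-class node where p is the proportion of observations in the first class, the three measures plotted below are:

Gini index: G = 2p(1 - p)
Cross-entropy: D = -[p log(p) + (1 - p) log(1 - p)]
Classification error: E = 1 - max(p, 1 - p)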

p = seq(0, 1, 0.01)
gini = 2 * p * (1 - p)
entropy = -(p * log(p) + (1 - p) * log(1 - p))  # NaN at p = 0 and p = 1; matplot skips these points
class.err = 1 - pmax(p, 1 - p)

matplot(p, cbind(gini, entropy, class.err), type = "l", lty = 1:3,
        col = c("red", "green", "blue"),
        xlab = "p", ylab = "Impurity measure",
        main = "Comparison of Gini Index, Entropy, and Classification Error")

legend("topright", legend = c("Gini", "Entropy", "Class Error"), lty = 1:3,
       col = c("red", "green", "blue"))

All three measures are maximized at p = 0.5. The Gini index and cross-entropy are smooth and differentiable, which is why they are preferred when growing a tree, while the classification error is piecewise linear and less sensitive to changes in the node probabilities.

2 ISL Exercise 8.4.4 (10pts)

3 ISL Exercise 8.4.5 (10pts)

Majority vote → Red: six of the ten bootstrap estimates of P(Red | X) exceed 0.5, so the majority-vote classification is Red. Averaging → Green: the mean of the ten estimated probabilities is 0.45 < 0.5, so the average-probability classification is Green.
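A quick check in R, using the ten probability estimates given in the exercise:

probs <- c(0.1, 0.15, 0.2, 0.2, 0.55, 0.6, 0.6, 0.65, 0.7, 0.75)
sum(probs > 0.5)  # 6 of 10 estimates vote Red -> majority vote: Red
mean(probs)       # 0.45 < 0.5 -> averaged probability: Green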

4 ISL Lab 8.3. Boston data set (30pts)

Follow the machine learning workflow to train regression tree, random forest, and boosting methods for predicting medv. Evaluate out-of-sample performance on a test set.

# Load necessary libraries
library(MASS)         # Boston dataset
library(tree)         # Regression Tree
library(randomForest) # Random Forest
library(gbm)          # Boosting (Gradient Boosting)

# Set seed for reproducibility
set.seed(1)

# Split the dataset into training (50%) and testing (50%)
train <- sample(1:nrow(Boston), nrow(Boston) / 2)
Boston.test <- Boston[-train, "medv"]  # Extract true test set values
1. Regression tree
# Train Regression Tree
tree.Boston <- tree(medv ~ ., Boston, subset = train)
summary(tree.Boston)

Regression tree:
tree(formula = medv ~ ., data = Boston, subset = train)
Variables actually used in tree construction:
[1] "rm"    "lstat" "crim"  "age"  
Number of terminal nodes:  7 
Residual mean deviance:  10.38 = 2555 / 246 
Distribution of residuals:
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-10.1800  -1.7770  -0.1775   0.0000   1.9230  16.5800 
# Plot the tree
plot(tree.Boston)
text(tree.Boston)

# Cross-validation for pruning
cv.Boston <- cv.tree(tree.Boston)
plot(cv.Boston$size, cv.Boston$dev, type = "b")

# Prune the tree with best = 5 (per original code)
prune.Boston <- prune.tree(tree.Boston, best = 5)
plot(prune.Boston)
text(prune.Boston, pretty = 5)

# Predict on the test set with the unpruned tree (as in the lab) and plot predictions vs. truth
yhat.tree <- predict(tree.Boston, newdata = Boston[-train, ])
plot(yhat.tree, Boston.test)
abline(0, 1)

# Compute the test set MSE
tree.mse <- mean((yhat.tree - Boston.test)^2)
tree.mse
[1] 35.28688

In other words, the test set MSE associated with the regression tree is 35.29. The square root of the MSE is therefore around 5.941, indicating that this model leads to test predictions that are (on average) within approximately $5,941 of the true median home value for the census tract.
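As a quick check (medv is recorded in $1,000s, so the root of the test MSE converts directly to dollars):

sqrt(tree.mse)  # about 5.94, i.e. predictions within roughly $5,940 on average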

2. Bagging and random forest
# Train bagging model (mtry = 12 per the ISL lab, which uses the 12-predictor
# ISLR2 Boston; MASS's Boston used here has 13 predictors, so full bagging
# would use mtry = 13)
set.seed(1)
bag.Boston <- randomForest(medv ~ ., data = Boston, 
                           subset = train, mtry = 12, 
                           importance = TRUE)

# Test set prediction & MSE
yhat.bag <- predict(bag.Boston, newdata = Boston[-train, ])
bag.mse <- mean((yhat.bag - Boston.test)^2)
bag.mse
[1] 23.38773
# Train Bagging with ntree = 25 (per original code)
bag.Boston <- randomForest(medv ~ ., data = Boston, 
                           subset = train, mtry = 12, ntree = 25)
yhat.bag <- predict(bag.Boston, newdata = Boston[-train, ])
bag.mse2 <- mean((yhat.bag - Boston.test)^2)
bag.mse2
[1] 25.19144
# Train Random Forest (mtry = 6, per original code)
set.seed(1)
rf.Boston <- randomForest(medv ~ ., data = Boston,
                          subset = train, mtry = 6, importance = TRUE)

# Test set prediction & MSE
yhat.rf <- predict(rf.Boston, newdata = Boston[-train, ])
rf.mse <- mean((yhat.rf - Boston.test)^2)
rf.mse
[1] 19.62021
# Feature importance & visualization
importance(rf.Boston)
          %IncMSE IncNodePurity
crim    16.697017    1076.08786
zn       3.625784      88.35342
indus    4.968621     609.53356
chas     1.061432      52.21793
nox     13.518179     709.87339
rm      32.343305    7857.65451
age     13.272498     612.21424
dis      9.032477     714.94674
rad      2.878434      95.80598
tax      9.118801     364.92479
ptratio  8.467062     823.93341
black    7.579482     275.62272
lstat   27.129817    6027.63740
varImpPlot(rf.Boston)

3. Boosting
# Train Boosting model (default shrinkage = 0.1)
set.seed(1)
boost.Boston <- gbm(medv ~ ., data = Boston[train, ],
                    distribution = "gaussian", n.trees = 5000,
                    interaction.depth = 4)

# Partial dependence plots for the two most influential variables, rm and lstat
plot(boost.Boston, i = "rm")

plot(boost.Boston, i = "lstat")

# Test set prediction & MSE
yhat.boost <- predict(boost.Boston, newdata = Boston[-train, ], n.trees = 5000)
boost.mse <- mean((yhat.boost - Boston.test)^2)
boost.mse
[1] 18.84709
# Train Boosting with shrinkage = 0.2 (per original code)
boost.Boston <- gbm(medv ~ ., data = Boston[train, ], 
                    distribution = "gaussian", n.trees = 5000, 
                    interaction.depth = 4, shrinkage = 0.2, verbose = F)

# Test set prediction & MSE
yhat.boost <- predict(boost.Boston, newdata = Boston[-train, ], n.trees = 5000)
boost.mse2 <- mean((yhat.boost - Boston.test)^2)
boost.mse2
[1] 18.33455
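Rather than fixing n.trees at 5,000, one could let gbm choose the tree count by cross-validation via its cv.folds argument and gbm.perf(); a minimal sketch, not part of the original lab, assuming 5 folds (boost.cv, best.iter, and yhat.cv are hypothetical names):

set.seed(1)
boost.cv <- gbm(medv ~ ., data = Boston[train, ], distribution = "gaussian",
                n.trees = 5000, interaction.depth = 4, shrinkage = 0.1,
                cv.folds = 5, verbose = FALSE)
best.iter <- gbm.perf(boost.cv, method = "cv")  # CV-optimal number of trees
yhat.cv <- predict(boost.cv, newdata = Boston[-train, ], n.trees = best.iter)
mean((yhat.cv - Boston.test)^2)  # test MSE at the CV-chosen iteration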
# Compare MSE results
mse_results <- data.frame(
  Model = c("Regression Tree", "Bagging (mtry=12, ntree=500)", 
            "Bagging (mtry=12, ntree=25)", "Random Forest", 
            "Boosting (shrinkage=0.1)", "Boosting (shrinkage=0.2)"),
  MSE = c(tree.mse, bag.mse, bag.mse2, rf.mse, boost.mse, boost.mse2)
)
print(mse_results)
                         Model      MSE
1              Regression Tree 35.28688
2 Bagging (mtry=12, ntree=500) 23.38773
3  Bagging (mtry=12, ntree=25) 25.19144
4                Random Forest 19.62021
5     Boosting (shrinkage=0.1) 18.84709
6     Boosting (shrinkage=0.2) 18.33455
# Plot MSE comparison
barplot(mse_results$MSE, names.arg = mse_results$Model, 
        col = c("red", "blue", "blue", "green", "orange", "orange"), 
        main = "MSE Comparison", ylab = "Mean Squared Error", las = 2, cex.names = 0.8)

Boosting attains the lowest test MSE (18.33 with shrinkage = 0.2; 18.85 with 0.1), the random forest comes next (19.62), bagging trails behind it, and the single regression tree does worst (35.29).

5 ISL Lab 8.3 Carseats data set (30pts)

Follow the machine learning workflow to train classification tree, random forest, and boosting methods for classifying Sales <= 8 versus Sales > 8. Evaluate out-of-sample performance on a test set.

# Load required libraries
library(tree)          # Classification Tree
library(randomForest)  # Random Forest
library(gbm)           # Gradient Boosting
library(caret)         # Confusion Matrix
library(ISLR2)

# Set seed for reproducibility
set.seed(2)

# Convert Sales into a binary variable
Carseats$High <- factor(ifelse(Carseats$Sales > 8, "Yes", "No"))

# Remove the Sales column
Carseats <- subset(Carseats, select = -Sales)

# Train-test split (50-50)
train <- sample(1:nrow(Carseats), nrow(Carseats) / 2)
Carseats.train <- Carseats[train, ]
Carseats.test <- Carseats[-train, ]
High.test <- Carseats.test$High

# ---- Classification Tree ----
tree.carseats <- tree(High ~ ., data = Carseats.train)
summary(tree.carseats)

Classification tree:
tree(formula = High ~ ., data = Carseats.train)
Variables actually used in tree construction:
[1] "Price"       "Population"  "ShelveLoc"   "Age"         "Education"  
[6] "CompPrice"   "Advertising" "Income"      "US"         
Number of terminal nodes:  21 
Residual mean deviance:  0.5543 = 99.22 / 179 
Misclassification error rate: 0.115 = 23 / 200 
plot(tree.carseats)
text(tree.carseats, pretty = 0)

# Test set prediction
tree.pred <- predict(tree.carseats, Carseats.test, type = "class")
confusion_matrix_tree <- table(tree.pred, High.test)
tree_accuracy <- sum(diag(confusion_matrix_tree)) / sum(confusion_matrix_tree)

# ---- Pruning ----
set.seed(7)
cv.carseats <- cv.tree(tree.carseats, FUN = prune.misclass)
prune_size <- cv.carseats$size[which.min(cv.carseats$dev)]  # Select pruning size with minimum error
prune.carseats <- prune.misclass(tree.carseats, best = prune_size)

plot(prune.carseats)
text(prune.carseats, pretty = 0)

# Evaluate pruned tree
tree.pred.pruned <- predict(prune.carseats, Carseats.test, type = "class")
confusion_matrix_pruned <- table(tree.pred.pruned, High.test)
pruned_accuracy <- sum(diag(confusion_matrix_pruned)) / sum(confusion_matrix_pruned)

# ---- Random Forest ----

# Tune mtry using tuneRF()
set.seed(1)
best_mtry <- tuneRF(Carseats.train[-ncol(Carseats.train)], Carseats.train$High, 
                     stepFactor = 1.5, improve = 0.01, trace = FALSE)
-0.245283 0.01 
-0.09433962 0.01 

mtry_best <- best_mtry[which.min(best_mtry[, 2]), 1]  # mtry with the lowest OOB error

# Train Random Forest
set.seed(1)
rf.carseats <- randomForest(High ~ ., data = Carseats.train, ntree = 500, mtry = mtry_best, importance = TRUE)

# Test set prediction
rf.pred <- predict(rf.carseats, Carseats.test)
confusion_matrix_rf <- table(rf.pred, High.test)
rf_accuracy <- sum(diag(confusion_matrix_rf)) / sum(confusion_matrix_rf)

# Variable Importance
importance(rf.carseats)
                    No         Yes MeanDecreaseAccuracy MeanDecreaseGini
CompPrice    2.1304913  2.21583500            3.0233624        11.332341
Income      -1.2187382  0.08332858           -0.9127061        11.247612
Advertising  2.7107740  9.28221142            8.0200032        11.539777
Population  -2.5279538 -1.96748887           -3.4496266         9.515984
Price       17.6102844 16.18029714           22.2205678        21.853751
ShelveLoc   11.8799961 11.50873822           15.0797611        10.106813
Age          3.7038805  6.83255965            6.9440472        11.738663
Education   -2.9218854 -0.16331173           -2.5039109         6.031744
Urban       -0.9337001  0.16405241           -0.4965739         1.114652
US          -0.4330561  1.48555372            0.7195542         1.127466
varImpPlot(rf.carseats)

# ---- Boosting ----
boost.train <- Carseats.train
boost.test <- Carseats.test
boost.train$High <- ifelse(boost.train$High == "Yes", 1, 0)  # Convert to 0/1 for Boosting

set.seed(1)
boost.carseats <- gbm(High ~ ., data = boost.train, distribution = "bernoulli",
                      n.trees = 5000, interaction.depth = 4, shrinkage = 0.01, verbose = FALSE)

# Prediction
boost.prob <- predict(boost.carseats, newdata = boost.test, n.trees = 5000, type = "response")
boost.pred <- factor(ifelse(boost.prob > 0.5, "Yes", "No"), levels = levels(High.test))

confusion_matrix_boost <- table(boost.pred, High.test)
boost_accuracy <- sum(diag(confusion_matrix_boost)) / sum(confusion_matrix_boost)

# Variable Importance
summary(boost.carseats)

                    var    rel.inf
Price             Price 24.3344414
CompPrice     CompPrice 14.3555860
ShelveLoc     ShelveLoc 13.4625168
Age                 Age 12.5211889
Advertising Advertising 11.2205000
Income           Income 11.1522623
Population   Population  8.0522931
Education     Education  3.4101404
Urban             Urban  0.8634069
US                   US  0.6276643
# Test Boosting with different shrinkage (0.2)
set.seed(1)
boost.carseats2 <- gbm(High ~ ., data = boost.train, distribution = "bernoulli",
                        n.trees = 5000, interaction.depth = 4, shrinkage = 0.2, verbose = FALSE)

boost.prob2 <- predict(boost.carseats2, newdata = boost.test, n.trees = 5000, type = "response")
boost.pred2 <- factor(ifelse(boost.prob2 > 0.5, "Yes", "No"), levels = levels(High.test))

confusion_matrix_boost2 <- table(boost.pred2, High.test)
boost_accuracy2 <- sum(diag(confusion_matrix_boost2)) / sum(confusion_matrix_boost2)
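The same cross-validated choice of n.trees could be applied to the classification model; a minimal sketch under the same assumptions as the Boston section (5 folds; boost.cv2, best.iter2, and prob.cv are hypothetical names):

set.seed(1)
boost.cv2 <- gbm(High ~ ., data = boost.train, distribution = "bernoulli",
                 n.trees = 5000, interaction.depth = 4, shrinkage = 0.2,
                 cv.folds = 5, verbose = FALSE)
best.iter2 <- gbm.perf(boost.cv2, method = "cv")  # CV-optimal number of trees
prob.cv <- predict(boost.cv2, newdata = boost.test, n.trees = best.iter2,
                   type = "response")
mean(ifelse(prob.cv > 0.5, "Yes", "No") == High.test)  # test accuracy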

# ---- Confusion Matrix Visualization ----
cat("Classification Tree:\n")
Classification Tree:
print(confusionMatrix(as.factor(tree.pred), High.test))
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  104  33
       Yes  13  50
                                          
               Accuracy : 0.77            
                 95% CI : (0.7054, 0.8264)
    No Information Rate : 0.585           
    P-Value [Acc > NIR] : 2.938e-08       
                                          
                  Kappa : 0.5091          
                                          
 Mcnemar's Test P-Value : 0.005088        
                                          
            Sensitivity : 0.8889          
            Specificity : 0.6024          
         Pos Pred Value : 0.7591          
         Neg Pred Value : 0.7937          
             Prevalence : 0.5850          
         Detection Rate : 0.5200          
   Detection Prevalence : 0.6850          
      Balanced Accuracy : 0.7456          
                                          
       'Positive' Class : No              
                                          
cat("\nPruned Tree:\n")

Pruned Tree:
print(confusionMatrix(as.factor(tree.pred.pruned), High.test))
Confusion Matrix and Statistics

          Reference
Prediction No Yes
       No  97  25
       Yes 20  58
                                          
               Accuracy : 0.775           
                 95% CI : (0.7108, 0.8309)
    No Information Rate : 0.585           
    P-Value [Acc > NIR] : 1.206e-08       
                                          
                  Kappa : 0.5325          
                                          
 Mcnemar's Test P-Value : 0.551           
                                          
            Sensitivity : 0.8291          
            Specificity : 0.6988          
         Pos Pred Value : 0.7951          
         Neg Pred Value : 0.7436          
             Prevalence : 0.5850          
         Detection Rate : 0.4850          
   Detection Prevalence : 0.6100          
      Balanced Accuracy : 0.7639          
                                          
       'Positive' Class : No              
                                          
cat("\nRandom Forest:\n")

Random Forest:
print(confusionMatrix(as.factor(rf.pred), High.test))
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  110  24
       Yes   7  59
                                          
               Accuracy : 0.845           
                 95% CI : (0.7873, 0.8922)
    No Information Rate : 0.585           
    P-Value [Acc > NIR] : 1.939e-15       
                                          
                  Kappa : 0.671           
                                          
 Mcnemar's Test P-Value : 0.004057        
                                          
            Sensitivity : 0.9402          
            Specificity : 0.7108          
         Pos Pred Value : 0.8209          
         Neg Pred Value : 0.8939          
             Prevalence : 0.5850          
         Detection Rate : 0.5500          
   Detection Prevalence : 0.6700          
      Balanced Accuracy : 0.8255          
                                          
       'Positive' Class : No              
                                          
cat("\nBoosting (shrinkage=0.01):\n")

Boosting (shrinkage=0.01):
print(confusionMatrix(as.factor(boost.pred), High.test))
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  109  16
       Yes   8  67
                                          
               Accuracy : 0.88            
                 95% CI : (0.8267, 0.9216)
    No Information Rate : 0.585           
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.7493          
                                          
 Mcnemar's Test P-Value : 0.153           
                                          
            Sensitivity : 0.9316          
            Specificity : 0.8072          
         Pos Pred Value : 0.8720          
         Neg Pred Value : 0.8933          
             Prevalence : 0.5850          
         Detection Rate : 0.5450          
   Detection Prevalence : 0.6250          
      Balanced Accuracy : 0.8694          
                                          
       'Positive' Class : No              
                                          
cat("\nBoosting (shrinkage=0.2):\n")

Boosting (shrinkage=0.2):
print(confusionMatrix(as.factor(boost.pred2), High.test))
Confusion Matrix and Statistics

          Reference
Prediction  No Yes
       No  111  36
       Yes   6  47
                                          
               Accuracy : 0.79            
                 95% CI : (0.7269, 0.8443)
    No Information Rate : 0.585           
    P-Value [Acc > NIR] : 7.046e-10       
                                          
                  Kappa : 0.5435          
                                          
 Mcnemar's Test P-Value : 7.648e-06       
                                          
            Sensitivity : 0.9487          
            Specificity : 0.5663          
         Pos Pred Value : 0.7551          
         Neg Pred Value : 0.8868          
             Prevalence : 0.5850          
         Detection Rate : 0.5550          
   Detection Prevalence : 0.7350          
      Balanced Accuracy : 0.7575          
                                          
       'Positive' Class : No              
                                          
# ---- Accuracy Comparison ----
accuracy_results <- data.frame(
  Model = c("Classification Tree", "Pruned Tree", "Random Forest", "Boosting (0.01)", "Boosting (0.2)"),
  Accuracy = c(tree_accuracy, pruned_accuracy, rf_accuracy, boost_accuracy, boost_accuracy2)
)

print(accuracy_results)
                Model Accuracy
1 Classification Tree    0.770
2         Pruned Tree    0.775
3       Random Forest    0.845
4     Boosting (0.01)    0.880
5      Boosting (0.2)    0.790
barplot(accuracy_results$Accuracy, names.arg = accuracy_results$Model, 
        col = c("red", "blue", "green", "orange", "purple"), 
        main = "Accuracy Comparison", ylab = "Accuracy", las = 2, cex.names = 0.8)

Boosting with shrinkage = 0.01 achieves the best test accuracy (0.880), followed by the random forest (0.845). Raising the shrinkage to 0.2 while keeping 5,000 trees appears to overfit, dropping accuracy to 0.790, close to the unpruned (0.770) and pruned (0.775) single trees.